The rendered version is at /pdf_notebooks/04-NY_data_analysis.pdf
import pandas as pd
import numpy as np
import re
import math
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
from plotly.subplots import make_subplots
from scipy.stats import spearmanr, chi2_contingency, pointbiserialr
from sklearn.cluster import KMeans
In this notebook, we analyse the US Traffic Accident dataset to derive insights and select features for predictive models.
# Load the cleaned New York accident records (2021-2022).
csv_path = "data/NY_Accidents_2021_2022_Clean.csv"
df = pd.read_csv(csv_path)
df.shape
(35095, 40)
# Parse the timestamp columns that read_csv loaded as plain strings.
for ts_col in ("Start_Time", "End_Time", "Date"):
    df[ts_col] = pd.to_datetime(df[ts_col])
# Partition the columns by dtype for the per-type analyses below.
numerical_vars = list(df.select_dtypes(include=['number']).columns)
boolean_vars = list(df.select_dtypes(include=['bool']).columns)
categorical_vars = list(df.select_dtypes(include=['object', 'category']).columns)
datetime_vars = list(df.select_dtypes(include=['datetime']).columns)
# One histogram per numerical column on a 4x4 grid.
fig = make_subplots(rows=4, cols=4, subplot_titles=numerical_vars)
for idx, name in enumerate(numerical_vars):
    r, c = divmod(idx, 4)
    # Severity only takes the discrete levels 1-4; everything else gets 25 bins.
    bins = 4 if name == 'Severity' else 25
    fig.add_trace(
        go.Histogram(x=df[name], nbinsx=bins, showlegend=False),
        row=r + 1, col=c + 1,
    )
fig.update_layout(height=700, width=1120, title_text="Histograms of Numerical Variables")
# Integer ticks for Severity (row 1, col 1) and Month (row 4, col 3).
fig.update_xaxes(tickvals=[1, 2, 3, 4], row=1, col=1)
fig.update_xaxes(tickvals=list(range(1, 13)), row=4, col=3)
fig.show(config={'staticPlot': True})
# Share of True/False values for every boolean flag, one pie per column.
# NOTE(review): true_counts is computed but never read below — kept for interactive
# inspection; confirm whether it can be dropped.
true_counts = df[boolean_vars].sum()
num_plots = len(boolean_vars)
num_cols = 8
num_rows = math.ceil(num_plots / num_cols)
# Pie traces require explicit 'pie'-type subplot specs.
fig = make_subplots(rows=num_rows, cols=num_cols, subplot_titles=boolean_vars,
                    specs=[[{'type': 'pie'}] * num_cols] * num_rows)
for i, column in enumerate(boolean_vars):
    row = i // num_cols + 1
    col = i % num_cols + 1
    counts = df[column].value_counts()
    fig.add_trace(
        go.Pie(labels=counts.index, values=counts, textinfo='percent', sort=False),
        row=row, col=col,
    )
# BUG FIX: update_layout used to run inside the loop, resetting the figure title
# to each column name on every iteration before being overwritten again at the
# end. All layout options are now set exactly once.
fig.update_layout(
    title='Distribution of Boolean Columns',
    font=dict(size=10),
    margin=dict(l=10, r=10, t=40, b=10),  # tight margins for the small pies
    showlegend=False,
    height=350, width=1000,
    template='plotly_white',
)
fig.show(config={'staticPlot': True})
Some variables may not be very useful as they are so we would use their transformed version or new variables extracted from them.
# Drop columns that are not analysed directly: Street has very high cardinality,
# State is presumably constant for a NY-only extract (TODO confirm), and
# Weather_Condition is superseded by the engineered Weather_Category column.
for dropped in ("Street", "State", "Weather_Condition"):
    categorical_vars.remove(dropped)
# Bar charts: weather category counts plus the 20 most frequent cities/counties.
fig = make_subplots(rows=1, cols=3, subplot_titles=['Weather Category', 'City', 'County'])
weather_cat_counts = df['Weather_Category'].value_counts()
fig.add_trace(go.Bar(x=weather_cat_counts.index, y=weather_cat_counts.values, marker_color='orange'), row=1, col=1)
city_counts = df['City'].value_counts().nlargest(20)
fig.add_trace(go.Bar(x=city_counts.index, y=city_counts.values, marker_color='green'), row=1, col=2)
county_counts = df['County'].value_counts().nlargest(20)
fig.add_trace(go.Bar(x=county_counts.index, y=county_counts.values, marker_color='purple'), row=1, col=3)
# BUG FIX: the old title mentioned "Intensity", which is not plotted here.
fig.update_layout(title='Weather Category, Top Cities and Top Counties',
                  height=340, width=900, showlegend=False)
fig.show(config={'staticPlot': True})
There are very few observations in certain weather categories, so we group the rarest ones into broader buckets such as "other".
# Fold rare weather categories into broader buckets so each level has enough data.
rare_to_bucket = {
    "snowstorm": "precipitation",
    "thunderstorm": "precipitation",
    "visibility Issue": "other",
    "extreme condition": "other",
}
df["Weather_Category"] = df["Weather_Category"].replace(rare_to_bucket)
In this section we study the relationship between each relevant variable and the target Severity.
# One fixed colour per severity level, reused across every plot below.
severity_colors = dict(zip((1, 2, 3, 4),
                           ('#1f77b4', '#9467bd', '#ff7f0e', '#d62728')))
coordinate_vars = ["Start_Lng", "Start_Lat"]
%%time
# Cast Severity to string so plotly express colours it as a discrete category.
df['Severity'] = df['Severity'].astype(str)
palette = [v for v in severity_colors.values()]
by_severity = df.sort_values(by=["Severity"])
fig = px.scatter(by_severity, x='Start_Lng', y='Start_Lat', color='Severity',
                 opacity=0.3, color_discrete_sequence=palette)
fig.update_layout(title='Start_Lat vs Start_Lng', height=450, width=650)
fig.show(config={'staticPlot': True})
# Restore the numeric dtype for the analyses that follow.
df['Severity'] = df['Severity'].astype(int)
CPU times: total: 156 ms Wall time: 166 ms
We can observe that most accidents occurred in the most populated areas such as New York City. We also observe the less severe accidents along interstate highways.
# Count the distinct latitude and longitude values.
df["Start_Lat"].unique().size, df["Start_Lng"].unique().size
(21235, 21201)
Since Start_Lat and Start_Lng cover a wide range of values, we can cluster the points to capture areas of different densities.
# Group accident coordinates into 18 spatial clusters; fixed seed for reproducibility.
kmeans = KMeans(n_clusters=18, random_state=0)
df['Location_Cluster'] = kmeans.fit_predict(df[['Start_Lat', 'Start_Lng']])
%%time
# String dtype makes the cluster ids discrete colour keys for plotly express.
df['Location_Cluster'] = df['Location_Cluster'].astype(str)
by_cluster = df.sort_values(by=["Location_Cluster"])
fig = px.scatter(by_cluster, x='Start_Lng', y='Start_Lat',
                 color='Location_Cluster', opacity=0.5)
fig.update_layout(title='Location Clusters', height=450, width=650)
fig.show(config={'staticPlot': True})
# Restore the integer dtype afterwards.
df['Location_Cluster'] = df['Location_Cluster'].astype(int)
CPU times: total: 141 ms Wall time: 203 ms
categorical_vars.append("Location_Cluster")
time_vars = ['Hour', 'Day_of_Week', 'Day', 'Month']
severities = sorted(df['Severity'].unique())
# Stacked share (%) of each severity level per time bucket, one subplot per
# temporal variable on a 2x2 grid.
fig = make_subplots(rows=2, cols=2, subplot_titles=[f'{var}' for var in time_vars])
for i, var in enumerate(time_vars):
    row = i // 2 + 1
    col = i % 2 + 1
    var_counts = df[var].value_counts().sort_index()
    for severity in severities:
        df_grouped = df[df['Severity'] == severity][var].value_counts()
        # Normalise raw counts to the share of all accidents in that bucket.
        df_grouped = df_grouped.div(var_counts, fill_value=float('NaN'))
        fig.add_trace(go.Bar(x=df_grouped.index, y=df_grouped.values * 100, name=f'Severity {severity}',
                             hovertemplate=f'Severity {severity}: {{y:.2f}}%',
                             marker=dict(color=severity_colors[severity]),
                             showlegend=(i == 0), opacity=0.8
                             ), row=row, col=col)
    # BUG FIX: this used row=1, col=i+1, which addresses columns 3 and 4 of a
    # 2-column grid and never labels row 2; label the subplot just drawn instead.
    fig.update_yaxes(title_text='Proportion (%)', row=row, col=col)
fig.update_layout(height=500, width=1000, title='Temporal Variables vs Severity', barmode='stack', showlegend=True)
fig.show(config={'staticPlot': True})
# NOTE(review): ordinal_vars is not defined anywhere in this view — presumably
# created in an earlier cell; confirm before re-running from here.
skip = set(time_vars) | set(ordinal_vars) | set(coordinate_vars)
num_vars = [v for v in numerical_vars if v not in skip]
num_vars.remove("Year")
num_vars.remove("Severity")
# One box plot per remaining numeric variable, grouped by severity (2x4 grid).
fig = make_subplots(rows=2, cols=4, subplot_titles=num_vars)
for idx, name in enumerate(num_vars):
    r, c = divmod(idx, 4)
    fig.add_trace(go.Box(
        y=df[name], x=df['Severity'],
        name=name,
        boxpoints="suspectedoutliers"
    ), row=r + 1, col=c + 1)
fig.update_layout(height=650, width=1100, title='Numerical Variables vs Severity', showlegend=False)
fig.show(config={'staticPlot': True})
To simplify our analysis, we decided to drop the "City" and "County" variables due to their high cardinality and the lack of additional context such as population size. This makes them less useful for our current scope. By excluding them, we can focus on more immediately actionable and interpretable variables.
# City and County are excluded: high cardinality with no population context.
categorical_vars.remove("City")
categorical_vars.remove("County")
severities = sorted(df['Severity'].unique())
# Stacked severity shares for the two remaining categorical variables.
fig = make_subplots(rows=1, cols=2, subplot_titles=[f'{var}' for var in categorical_vars], column_widths=[0.3, 0.7], horizontal_spacing = 0.15)
for i, var in enumerate(categorical_vars):
    row = i // 2 + 1
    col = i % 2 + 1
    var_counts = df[var].value_counts().sort_index()
    for severity in severities:
        df_grouped = df[df['Severity'] == severity][var].value_counts()
        # Normalise raw counts to the share of all accidents in that category.
        df_grouped = df_grouped.div(var_counts, fill_value=float('NaN'))
        fig.add_trace(go.Bar(x=df_grouped.index, y=df_grouped.values * 100, name=f'Severity {severity}',
                             hovertemplate=f'Severity {severity}: {{y:.2f}}%',
                             marker=dict(color=severity_colors[severity]),
                             showlegend=(i == 0), opacity=0.8
                             ), row=row, col=col)
    # BUG FIX: normalised the y-axis addressing from row=1, col=i+1 to the
    # subplot just drawn, matching the other severity-share cells.
    fig.update_yaxes(title_text='Proportion (%)', row=row, col=col)
fig.update_layout(height=350, width=750, title='Categorical Variables vs Severity', barmode='stack', showlegend=True)
fig.show(config={'staticPlot': True})
severities = sorted(df['Severity'].unique())
# Stacked severity shares for every boolean flag on a 2x8 grid.
fig = make_subplots(rows=2, cols=8, subplot_titles=[f'{var}' for var in boolean_vars], horizontal_spacing=0.09, vertical_spacing=0.25)
for i, var in enumerate(boolean_vars):
    row = i // 8 + 1
    col = i % 8 + 1
    var_counts = df[var].value_counts().sort_index()
    for severity in severities:
        df_grouped = df[df['Severity'] == severity][var].value_counts()
        # Normalise raw counts to the share of all accidents with that flag value.
        df_grouped = df_grouped.div(var_counts, fill_value=float('NaN'))
        fig.add_trace(go.Bar(x=df_grouped.index, y=df_grouped.values * 100, name=f'Severity {severity}',
                             hovertemplate=f'Severity {severity}: {{y:.2f}}%',
                             marker=dict(color=severity_colors[severity]),
                             showlegend=(i == 0), opacity=0.8
                             ), row=row, col=col)
    # BUG FIX: this used row=1, col=i+1, which addresses columns up to 16 of an
    # 8-column grid and never labels row 2; label the subplot just drawn instead.
    fig.update_yaxes(title_text='Proportion (%)', row=row, col=col)
fig.update_layout(
    height=500, width=1200,
    # BUG FIX: title said "Categorical" — a copy-paste from the previous cell.
    title='Boolean Variables vs Severity',
    barmode='stack',
    autosize=True, showlegend=True)
fig.show(config={'staticPlot': True})
# Association of every candidate feature with Severity, using a measure suited
# to each variable type. Results accumulate into one long-format table.
correlations = {'Variable': [], 'Correlation': [], 'Type': []}
# Spearman's rank correlation: monotone association for numeric/time variables.
for var in num_vars + coordinate_vars + time_vars:
    corr, _ = spearmanr(df[var], df['Severity'])
    correlations['Variable'].append(var)
    correlations['Correlation'].append(corr)
    correlations['Type'].append('Spearman\'s Rank')
# Chi-square test with Cramér's V effect size for categorical variables.
for var in categorical_vars:
    contingency_table = pd.crosstab(df[var], df['Severity'])
    chi2, p, dof, expected = chi2_contingency(contingency_table)
    cramers_v = np.sqrt(chi2 / (df.shape[0] * (min(contingency_table.shape) - 1)))
    correlations['Variable'].append(var)
    correlations['Correlation'].append(cramers_v)
    correlations['Type'].append('Chi-Square + Cramer\'s V')
# Point-biserial correlation for boolean variables.
for var in boolean_vars:
    # BUG FIX: constant columns (e.g. Roundabout, Turning_Loop) have an undefined
    # correlation; pointbiserialr emitted a constant-input warning and returned
    # NaN. Record NaN explicitly instead, keeping the output table identical.
    if df[var].nunique() > 1:
        corr, _ = pointbiserialr(df[var], df['Severity'])
    else:
        corr = float('nan')
    correlations['Variable'].append(var)
    correlations['Correlation'].append(corr)
    correlations['Type'].append('Point-Biserial')
C:\Users\ngoum\anaconda3\envs\dl-env\lib\site-packages\scipy\stats\stats.py:4023: PearsonRConstantInputWarning: An input array is constant; the correlation coefficient is not defined.
# Assemble the per-variable association scores into one table.
correlations_df = pd.DataFrame(correlations)
fig = go.Figure()
# One bar trace per association measure so each gets its own legend entry.
for measure in correlations_df['Type'].unique():
    subset = correlations_df[correlations_df['Type'] == measure]
    fig.add_trace(
        go.Bar(x=subset['Variable'], y=subset['Correlation'], name=measure)
    )
fig.update_layout(
    title='Strength of Relationships with Severity',
    xaxis_title='Variable',
    yaxis_title='Correlation Coefficient',
    xaxis_tickangle=90,
    legend_title='Type',
    legend=dict(x=1, y=1, traceorder='normal'),
    barmode='group',
    height=600, width=1100
)
fig.show(config={'staticPlot': True})
# Keep only variables with a non-trivial association (|coefficient| >= 0.1).
correlations_df[correlations_df["Correlation"].abs() >= 0.1]
| Variable | Correlation | Type | |
|---|---|---|---|
| 0 | Distance(mi) | -0.354626 | Spearman's Rank |
| 7 | Duration(min) | -0.238717 | Spearman's Rank |
| 15 | Location_Cluster | 0.247834 | Chi-Square + Cramer's V |
correlations_df  # display the full association table for reference
| Variable | Correlation | Type | |
|---|---|---|---|
| 0 | Distance(mi) | -0.354626 | Spearman's Rank |
| 1 | Temperature(F) | -0.067355 | Spearman's Rank |
| 2 | Humidity(%) | -0.024080 | Spearman's Rank |
| 3 | Pressure(in) | 0.014920 | Spearman's Rank |
| 4 | Visibility(mi) | -0.004877 | Spearman's Rank |
| 5 | Wind_Speed(mph) | 0.021117 | Spearman's Rank |
| 6 | Precipitation(in) | 0.001560 | Spearman's Rank |
| 7 | Duration(min) | -0.238717 | Spearman's Rank |
| 8 | Start_Lng | -0.046251 | Spearman's Rank |
| 9 | Start_Lat | 0.004445 | Spearman's Rank |
| 10 | Hour | 0.065942 | Spearman's Rank |
| 11 | Day_of_Week | 0.003503 | Spearman's Rank |
| 12 | Day | 0.015956 | Spearman's Rank |
| 13 | Month | -0.074053 | Spearman's Rank |
| 14 | Weather_Category | 0.029170 | Chi-Square + Cramer's V |
| 15 | Location_Cluster | 0.247834 | Chi-Square + Cramer's V |
| 16 | Amenity | 0.011462 | Point-Biserial |
| 17 | Bump | 0.019960 | Point-Biserial |
| 18 | Crossing | -0.037391 | Point-Biserial |
| 19 | Give_Way | -0.016632 | Point-Biserial |
| 20 | Junction | -0.016389 | Point-Biserial |
| 21 | No_Exit | -0.019821 | Point-Biserial |
| 22 | Railway | 0.003542 | Point-Biserial |
| 23 | Roundabout | NaN | Point-Biserial |
| 24 | Station | -0.023069 | Point-Biserial |
| 25 | Stop | 0.000131 | Point-Biserial |
| 26 | Traffic_Calming | 0.041375 | Point-Biserial |
| 27 | Traffic_Signal | -0.034158 | Point-Biserial |
| 28 | Turning_Loop | NaN | Point-Biserial |
| 29 | Is_Night | 0.075315 | Point-Biserial |
| 30 | Is_Highway | 0.000779 | Point-Biserial |